In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from pathlib import Path
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append("../../")
DATA_PATH = Path('../../../data/')
MODEL_PATH = Path('../../../models/keras2lwtnn')
In [4]:
import cufflinks as cf
import plotly.offline as py
py.init_notebook_mode()
cf.go_offline()
import plotly.graph_objs as go
In [5]:
import os

Data

In [6]:
k = 1
sampled_data = pd.read_hdf(DATA_PATH / 'train_data.h5', 'train_set').sample(frac=k, random_state=137)
In [7]:
unused_features = [
     'seed_nbIT', 'seed_nLayers', 'seed_mva_value', 'seed_nLHCbIDs',
    'is_downstream_reconstructible_not_electron', 'is_true_seed',
    'has_MCParticle_not_electron', 'has_MCParticle'
]
data = sampled_data.drop(unused_features, axis=1)
data = data.astype(np.float32)

data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690640 entries, 942921 to 2219110
Data columns (total 8 columns):
is_downstream_reconstructible    1690640 non-null float32
seed_chi2PerDoF                  1690640 non-null float32
seed_p                           1690640 non-null float32
seed_pt                          1690640 non-null float32
seed_x                           1690640 non-null float32
seed_y                           1690640 non-null float32
seed_tx                          1690640 non-null float32
seed_ty                          1690640 non-null float32
dtypes: float32(8)
memory usage: 64.5 MB
In [8]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=137)
label_names = [
    'is_downstream_reconstructible'
]
In [9]:
x_train = train_set.drop(label_names, axis=1)
y_train = train_set[label_names].copy().astype(np.int32)

x_test = test_set.drop(label_names, axis=1)
y_test = test_set[label_names].copy().astype(np.int32)
In [10]:
x_test.head()
Out[10]:
seed_chi2PerDoF seed_p seed_pt seed_x seed_y seed_tx seed_ty
1230524 1.224318 24853.207031 4751.315918 1218.260376 57.654774 0.194639 0.007061
416777 2.310915 14115.601562 1086.456421 174.162796 14.526168 0.077169 0.002084
1438292 6.570617 140397.859375 32186.728516 1750.810547 366.384735 0.219562 0.085235
2222976 0.769758 2739.445557 1166.308960 -1143.220337 80.898964 -0.470408 0.010258
2014207 1.118085 2894.345459 1438.781494 -1823.896362 -236.162720 -0.571847 -0.034705
In [11]:
from data_pipeline import data_pipeline_all_labels as data_pipeline
pipeline = data_pipeline()
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test)
In [12]:
x_test.head()
Out[12]:
seed_chi2PerDoF seed_p seed_pt seed_x seed_y seed_tx seed_ty seed_angle seed_pr seed_r
1230524 -0.445808 1.156707 2.759277 1.716483 0.119556 0.642654 0.113544 0.064640 -0.237567 0.978409
416777 0.515141 0.612609 -0.195430 0.245156 0.036956 0.259701 0.039830 0.113133 -0.869259 -1.036606
1438292 2.938306 2.822062 6.590351 2.466945 0.710833 0.723903 1.271406 0.279274 -0.021398 2.075946
2222976 -0.978061 -0.964290 -0.053407 -1.611282 0.164073 -1.525406 0.160899 -0.094564 1.179145 0.836577
2014207 -0.559849 -0.911387 0.367025 -2.570481 -0.443162 -1.856100 -0.505055 0.174625 1.671531 2.173117
In [13]:
x_train.sample(frac=0.01).iplot(kind='hist', bins=50)
In [14]:
data_renamed_columns = {column:column.replace("_", "\_") for column in x_train.columns}
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
x_train.rename(columns=data_renamed_columns).hist(bins=50, figsize=(15,15))
# plt.savefig('features_scaled_hist.eps')
Out[14]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b781df60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7578b38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7b3bb70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b78664a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b78fe9e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7534128>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b71f09e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7b005f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7abd630>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7a84588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b7a45470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b79906a0>]], dtype=object)
In [15]:
x_train.hist(bins=50, figsize=(10,10))
Out[15]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b71272e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b75b9320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b56ed400>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b569e7f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b5661400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b5618438>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b55d6550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b5594278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b555d4a8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b5526978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b54e1f98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f98b5431a90>]], dtype=object)
In [16]:
x_train.mean()
Out[16]:
seed_chi2PerDoF    6.351077e-07
seed_p            -1.311259e-06
seed_pt            1.804929e-06
seed_x            -9.468086e-09
seed_y            -2.489535e-09
seed_tx           -1.257864e-08
seed_ty            7.083393e-09
seed_angle         1.408684e-08
seed_pr           -5.485245e-08
seed_r            -7.954728e-08
dtype: float32
In [17]:
x_test.mean()
Out[17]:
seed_chi2PerDoF    0.000282
seed_p            -0.000365
seed_pt            0.002361
seed_x            -0.001611
seed_y             0.000366
seed_tx           -0.001950
seed_ty            0.000219
seed_angle         0.000350
seed_pr            0.001221
seed_r             0.001036
dtype: float32
In [18]:
# x_test.sample(frac=0.05).iplot(kind='hist', bins=50)
In [19]:
y_train.hist()
Out[19]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f98b53c7cf8>]], dtype=object)
In [20]:
# y_test.hist()
In [21]:
from sklearn.utils import class_weight

class_weight = class_weight.compute_class_weight('balanced', np.unique(y_train['is_downstream_reconstructible']), y_train['is_downstream_reconstructible'])
In [ ]:
from keras.callbacks import TensorBoard
from models import KerasDNN
import time
import keras


date_created = time.ctime()

DNNclf = KerasDNN(
    (x_train.shape[1],),
    (1,),
    neurons=124,
    layers=10,
    dropout=0.05,
    loss_metric='binary_crossentropy',
    metrics=['accuracy'],
    last_layer_act='sigmoid',
    kernel_initializer='he_normal',
    optimizer='adam',
    batch_norm=False,
    activation='selu',
)
Using TensorFlow backend.
In [ ]:
DNNclf.fit(
    x_train.values, y_train.values,
    epochs=200,
    validation_data=(x_test.values, y_test.values),
    class_weight=class_weight,
    batch_size=1000,
    callbacks=[
        TensorBoard(
            log_dir=f'./logs/{date_created}',
            histogram_freq=1,
            batch_size=1000,
            write_graph=True,
            write_grads=True,
        )
    ]
)
Train on 1352512 samples, validate on 338128 samples
Epoch 1/200
1352512/1352512 [==============================] - 104s - loss: 0.4197 - acc: 0.8136 - val_loss: 0.3547 - val_acc: 0.8491
Epoch 2/200
1352512/1352512 [==============================] - 96s - loss: 0.3666 - acc: 0.8431 - val_loss: 0.3448 - val_acc: 0.8588
Epoch 3/200
1352512/1352512 [==============================] - 94s - loss: 0.3537 - acc: 0.8501 - val_loss: 0.3360 - val_acc: 0.8591
Epoch 4/200
1352512/1352512 [==============================] - 99s - loss: 0.3460 - acc: 0.8542 - val_loss: 0.3233 - val_acc: 0.8661
Epoch 5/200
1352512/1352512 [==============================] - 90s - loss: 0.3400 - acc: 0.8570 - val_loss: 0.3241 - val_acc: 0.8644
Epoch 6/200
1352512/1352512 [==============================] - 90s - loss: 0.3357 - acc: 0.8598 - val_loss: 0.3240 - val_acc: 0.8661
Epoch 7/200
1352512/1352512 [==============================] - 113s - loss: 0.3321 - acc: 0.8612 - val_loss: 0.3173 - val_acc: 0.8684
Epoch 8/200
1352512/1352512 [==============================] - 97s - loss: 0.3296 - acc: 0.8624 - val_loss: 0.3177 - val_acc: 0.8687
Epoch 9/200
1352512/1352512 [==============================] - 99s - loss: 0.3268 - acc: 0.8644 - val_loss: 0.3128 - val_acc: 0.8705
Epoch 10/200
1352512/1352512 [==============================] - 91s - loss: 0.3247 - acc: 0.8652 - val_loss: 0.3175 - val_acc: 0.8712
Epoch 11/200
1352512/1352512 [==============================] - 106s - loss: 0.3233 - acc: 0.8658 - val_loss: 0.3122 - val_acc: 0.8712
Epoch 12/200
1352512/1352512 [==============================] - 99s - loss: 0.3217 - acc: 0.8666 - val_loss: 0.3103 - val_acc: 0.8732
Epoch 13/200
1352512/1352512 [==============================] - 97s - loss: 0.3200 - acc: 0.8673 - val_loss: 0.3070 - val_acc: 0.8738
Epoch 14/200
1352512/1352512 [==============================] - 103s - loss: 0.3187 - acc: 0.8684 - val_loss: 0.3086 - val_acc: 0.8727
Epoch 15/200
1352512/1352512 [==============================] - 92s - loss: 0.3176 - acc: 0.8690 - val_loss: 0.3081 - val_acc: 0.8752
Epoch 16/200
1352512/1352512 [==============================] - 93s - loss: 0.3168 - acc: 0.8692 - val_loss: 0.3093 - val_acc: 0.8732
Epoch 17/200
1352512/1352512 [==============================] - 91s - loss: 0.3154 - acc: 0.8697 - val_loss: 0.3112 - val_acc: 0.8740
Epoch 18/200
1352512/1352512 [==============================] - 130s - loss: 0.3147 - acc: 0.8704 - val_loss: 0.3035 - val_acc: 0.8768
Epoch 19/200
1352512/1352512 [==============================] - 107s - loss: 0.3136 - acc: 0.8708 - val_loss: 0.3051 - val_acc: 0.8761
Epoch 20/200
1352512/1352512 [==============================] - 101s - loss: 0.3129 - acc: 0.8710 - val_loss: 0.3019 - val_acc: 0.8768
Epoch 21/200
1352512/1352512 [==============================] - 97s - loss: 0.3119 - acc: 0.8716 - val_loss: 0.3038 - val_acc: 0.8767
Epoch 22/200
1352512/1352512 [==============================] - 111s - loss: 0.3115 - acc: 0.8720 - val_loss: 0.3030 - val_acc: 0.8763
Epoch 23/200
1352512/1352512 [==============================] - 107s - loss: 0.3109 - acc: 0.8722 - val_loss: 0.3012 - val_acc: 0.8764
Epoch 24/200
1352512/1352512 [==============================] - 108s - loss: 0.3101 - acc: 0.8726 - val_loss: 0.3017 - val_acc: 0.8765
Epoch 25/200
1352512/1352512 [==============================] - 113s - loss: 0.3098 - acc: 0.8727 - val_loss: 0.3036 - val_acc: 0.8766
Epoch 26/200
1352512/1352512 [==============================] - 107s - loss: 0.3094 - acc: 0.8729 - val_loss: 0.3044 - val_acc: 0.8757
Epoch 27/200
1352512/1352512 [==============================] - 96s - loss: 0.3085 - acc: 0.8733 - val_loss: 0.3003 - val_acc: 0.8781
Epoch 28/200
1352512/1352512 [==============================] - 104s - loss: 0.3080 - acc: 0.8737 - val_loss: 0.2994 - val_acc: 0.8774
Epoch 29/200
1352512/1352512 [==============================] - 110s - loss: 0.3083 - acc: 0.8735 - val_loss: 0.2982 - val_acc: 0.8792
Epoch 30/200
1352512/1352512 [==============================] - 101s - loss: 0.3074 - acc: 0.8740 - val_loss: 0.2974 - val_acc: 0.8793
Epoch 31/200
1352512/1352512 [==============================] - 103s - loss: 0.3069 - acc: 0.8742 - val_loss: 0.2987 - val_acc: 0.8784
Epoch 32/200
1352512/1352512 [==============================] - 109s - loss: 0.3068 - acc: 0.8744 - val_loss: 0.3001 - val_acc: 0.8777
Epoch 33/200
1352512/1352512 [==============================] - 100s - loss: 0.3061 - acc: 0.8746 - val_loss: 0.2988 - val_acc: 0.8779
Epoch 34/200
1352512/1352512 [==============================] - 108s - loss: 0.3057 - acc: 0.8746 - val_loss: 0.2949 - val_acc: 0.8798
Epoch 35/200
1352512/1352512 [==============================] - 105s - loss: 0.3054 - acc: 0.8750 - val_loss: 0.3010 - val_acc: 0.8764
Epoch 36/200
1352512/1352512 [==============================] - 105s - loss: 0.3055 - acc: 0.8752 - val_loss: 0.2983 - val_acc: 0.8771
Epoch 37/200
1352512/1352512 [==============================] - 112s - loss: 0.3048 - acc: 0.8754 - val_loss: 0.2954 - val_acc: 0.8790
Epoch 38/200
1352512/1352512 [==============================] - 103s - loss: 0.3044 - acc: 0.8755 - val_loss: 0.2950 - val_acc: 0.8801
Epoch 39/200
1352512/1352512 [==============================] - 103s - loss: 0.3045 - acc: 0.8755 - val_loss: 0.2958 - val_acc: 0.8794
Epoch 40/200
1352512/1352512 [==============================] - 108s - loss: 0.3045 - acc: 0.8752 - val_loss: 0.2984 - val_acc: 0.8802
Epoch 41/200
1352512/1352512 [==============================] - 109s - loss: 0.3041 - acc: 0.8758 - val_loss: 0.2944 - val_acc: 0.8798
Epoch 42/200
1339000/1352512 [============================>.] - ETA: 0s - loss: 0.3036 - acc: 0.8759- ETA: 1s - loss: 0.3037
In [ ]:
history = DNNclf.model.history
In [ ]:
DNNclf.model.summary(line_length=100)
In [ ]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.savefig('loss.eps')
In [ ]:
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
# plt.savefig('acc.eps')

Quick evaluation

In [ ]:
predictions = DNNclf.predict_proba(x_test.values)
In [ ]:
predictions_df = pd.DataFrame(predictions, columns=y_test.columns, index=y_test.index)
In [ ]:
predictions_df.head()
In [ ]:
from sklearn.metrics import accuracy_score
In [ ]:
%time accuracy_score(y_train.values,  DNNclf.predict_proba(x_train.values) > 0.5)
In [ ]:
%time accuracy_score(y_test.values, predictions_df.values> 0.5)
In [ ]:
from sklearn.metrics import roc_auc_score
In [ ]:
roc_auc_score(y_train.values, DNNclf.predict_proba(x_train.values))
In [ ]:
roc_auc_score(y_test.values, predictions_df.values)
In [ ]:
from sklearn.metrics import log_loss
In [ ]:
log_loss(y_train.values, DNNclf.predict_proba(x_train.values))
In [ ]:
log_loss(y_test.values, DNNclf.predict_proba(x_test.values))
In [ ]:
from utils import plot_roc_curve, plot_true_positives_and_negatives, plot_confusion_matrix
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score
In [ ]:
# test
plot_confusion_matrix(
    confusion_matrix(y_test.values, np.array(predictions_df.values > 0.5, dtype=np.int32)),
    classes=["ghost","seed"],
    title='Normalized confusion matrix - DNN',
    normalize=True
)
In [ ]:
# train
plot_confusion_matrix(
    confusion_matrix(y_train.values, np.array(DNNclf.model.predict(x_train.values) > 0.5, dtype=np.int32)),
    classes=["ghost","seed"],
    title='Normalized confusion matrix - DNN',
    normalize=True
)
In [ ]:
%autoreload 2
def plot_true_positives_and_negatives(
        y_true,
        probabilities,
        normalize=False,
        step=0.1,
        title='True positives and true negatives vs threshold', ):

    thresholds = np.arange(0.0, 1.0, step)
    true_positives_rate = np.empty(thresholds.shape)
    true_negatives_rate = np.empty(thresholds.shape)
    for i, threshold in enumerate(thresholds):
        classified_examples = np.array(
            probabilities > threshold, dtype=int)
        cm = confusion_matrix(y_true, classified_examples)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        true_positives_rate[i] = cm[1, 1]
        true_negatives_rate[i] = cm[0, 0]

    plots = [
        go.Scatter(x=thresholds, y=true_positives_rate, name='true positives'),
        go.Scatter(x=thresholds, y=true_negatives_rate, name='true negatives'),
    ]
    layout = go.Layout(
        title=title,
        xaxis=dict(title='threshold'),
    )
    fig = go.Figure(data=plots, layout=layout)

    py.iplot(fig)
    return true_positives_rate, true_negatives_rate, thresholds
In [ ]:
# train
true_positives_rate_tr, true_negatives_rate_tr, thresholds = plot_true_positives_and_negatives(
    y_train.values, DNNclf.model.predict(x_train.values),
    title='Thresholds - DNN',
    step=1e-2,
    normalize=True
)
In [ ]:
# test
true_positives_rate, true_negatives_rate, thresholds = plot_true_positives_and_negatives(
    y_test.values, predictions_df.values,
    title='Thresholds - DNN',
    step=5e-2,
    normalize=True
)
In [ ]:
plt.figure(figsize=(16,9))
plt.plot(thresholds, true_positives_rate_tr)
plt.plot(thresholds, true_negatives_rate_tr)
plt.plot(thresholds, true_positives_rate)
plt.plot(thresholds, true_negatives_rate)
plt.xlabel('Threshold')
plt.legend(['TP rate train', 'TN rate train', 'TP rate validation', 'TN rate validation'], loc='lower right')
In [ ]:
true_positives_rate, true_negatives_rate, thresholds
In [ ]:
x_train.mean()
In [ ]:
x_test.mean()
In [ ]:
predictions_df['correct'] = np.array( np.equal(y_test.values, predictions_df.values > 0.5))

PIpeline info

In [ ]:
pipeline.named_steps
In [ ]:
def get_pipeline_params(pipeline):
    d = {}
    for name, step in pipeline.named_steps.items():
        try:
            d[step] = {'shift': -step.scaler.mean_[0], 'scale': 1.0/step.scaler.scale_[0]}
        except Exception as e:
                pass
            
    return d
In [ ]:
get_pipeline_params(pipeline)

Saving model

In [ ]:
model_arch = DNNclf.model.to_json()
In [ ]:
model_dir = (MODEL_PATH / date_created).as_posix()
os.mkdir(model_dir)
In [ ]:
with (MODEL_PATH / date_created / 'architecture.json').open('w') as arch_file:
    arch_file.write(model_arch)
In [ ]:
DNNclf.model.save_weights(MODEL_PATH / date_created / 'weights.h5')
In [ ]:
print(date_created)